library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 1.0.0 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ──────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(rvest)
## Loading required package: xml2
##
## Attaching package: 'rvest'
## The following object is masked from 'package:purrr':
##
## pluck
## The following object is masked from 'package:readr':
##
## guess_encoding
library(readr)
library(viridis)
## Loading required package: viridisLite
library(leaflet)
# Default ggplot2 theme for every figure: minimal look, legend underneath.
theme_set(theme_minimal() + theme(legend.position = "bottom"))

# Use the viridis palettes for continuous and discrete colour/fill scales.
options(
  ggplot2.continuous.colour = "viridis",
  ggplot2.continuous.fill = "viridis"
)
scale_fill_discrete <- scale_fill_viridis_d
scale_colour_discrete <- scale_colour_viridis_d

# Chunk defaults applied to all subsequent knitr chunks: show the code,
# hide warnings, and fix the figure dimensions.
knitr::opts_chunk$set(
  echo = TRUE,
  warning = FALSE,
  fig.width = 8,
  fig.height = 6,
  out.width = "90%"
)
# Read the 2018 collision records and clean the column names with janitor.
data_2018 <- janitor::clean_names(read_csv("./data/2018data.csv"))
## Parsed with column specification:
## cols(
## .default = col_character(),
## TIME = col_time(format = ""),
## `ZIP CODE` = col_double(),
## LATITUDE = col_double(),
## LONGITUDE = col_double(),
## `NUMBER OF PERSONS INJURED` = col_double(),
## `NUMBER OF PERSONS KILLED` = col_double(),
## `NUMBER OF PEDESTRIANS INJURED` = col_double(),
## `NUMBER OF PEDESTRIANS KILLED` = col_double(),
## `NUMBER OF CYCLIST INJURED` = col_double(),
## `NUMBER OF CYCLIST KILLED` = col_double(),
## `NUMBER OF MOTORIST INJURED` = col_double(),
## `NUMBER OF MOTORIST KILLED` = col_double(),
## COLLISION_ID = col_double()
## )
## See spec(...) for full column specifications.
# Drop the first "number_of_" occurrence from each column name
# (e.g. number_of_persons_injured -> persons_injured).
newnames <- str_replace(
  string = colnames(data_2018),
  pattern = "number_of_",
  replacement = ""
)
names(data_2018) <- newnames
# Build the analysis table from the raw records:
#   * keep the original date string as `date_complete`,
#   * split date into month/day/year and time into hour/minute,
#   * drop address/identifier columns and the year,
#   * rename the first vehicle type column and lower-case its values,
#   * keep only rows whose latitude is present and non-zero.
tidy_data <-
  data_2018 %>%
  mutate(date_complete = date) %>%
  separate(date, into = c("month", "day", "year"), sep = "/") %>%
  separate(time, into = c("hour", "minute"), sep = ":") %>%
  select(
    -zip_code, -location, -on_street_name, -cross_street_name,
    -off_street_name, -collision_id, -year
  ) %>%
  rename(vehicle_type = vehicle_type_code_1) %>%
  mutate(
    month = as.numeric(month),
    day = as.numeric(day),
    hour = as.numeric(hour),
    minute = as.numeric(minute),
    vehicle_type = str_to_lower(vehicle_type)
  ) %>%
  # A missing latitude and a latitude of 0 are both treated as "no location".
  filter(!is.na(latitude), latitude != 0)
Vehicle type
# Hourly collision counts for four vehicle categories.
# Any type mentioning "truck" is collapsed to "truck"; otherwise any type
# mentioning "sport utility" is collapsed to "sport utility vehicle".
vehicle_types_kept <- c(
  "taxi", "passenger vehicle", "truck", "sport utility vehicle"
)

vehicle_type_data <-
  tidy_data %>%
  mutate(
    vehicle_type = case_when(
      str_detect(vehicle_type, "truck") ~ "truck",
      str_detect(vehicle_type, "sport utility") ~ "sport utility vehicle",
      TRUE ~ vehicle_type
    )
  ) %>%
  filter(vehicle_type %in% vehicle_types_kept) %>%
  group_by(vehicle_type, hour) %>%
  summarize(n = n())
# Line chart of collisions per hour of day, one trace per vehicle type.
# The scatter trace mode must be "lines": "line" is not a valid mode flag,
# so plotly would fall back to its default mode instead of honouring it.
vehicle_type_data %>%
  plot_ly(
    x = ~hour,
    y = ~n,
    color = ~vehicle_type,
    type = "scatter",
    mode = "lines"
  ) %>%
  layout(
    title = "Collisions of Day for Different Vehicles",
    xaxis = list(title = "Hour of Day"),
    yaxis = list(title = "Collisions")
  )
Top 10 Collision Reasons
# The ten most frequent values of the first contributing factor,
# ordered from most to least common.
reason_data <-
  tidy_data %>%
  count(contributing_factor_vehicle_1, sort = TRUE) %>%
  head(10)
# Bar chart of the most frequent contributing factors, bars ordered from the
# largest count to the smallest.
# The title previously read "The Number of Items Ordered in Each Aisle", which
# was left over from an unrelated analysis; it now describes this chart.
reason_data %>%
  plot_ly(
    x = ~reorder(contributing_factor_vehicle_1, desc(n)),
    y = ~n,
    color = ~contributing_factor_vehicle_1,
    type = "bar"
  ) %>%
  layout(
    title = "Most Frequent Contributing Factors for Collisions",
    xaxis = list(title = "Different Reasons"),
    yaxis = list(title = "Count")
  )
Mapping
# Interactive map of a random sample of collisions with more than two people
# injured, coloured by the number of persons injured.
#
# Changes from the earlier version of this chunk:
#   * latitude/longitude are renamed to `lat`/`long` respectively (they used to
#     be swapped and only worked because they were swapped back positionally
#     in addCircleMarkers());
#   * the longitude bounds are numeric. They used to be the strings "-70" and
#     "-75", which made R compare the values as text;
#   * the sample size is capped at the number of available rows, because
#     sample_n() errors when asked for more rows than exist.
data_2018 <- rename(tidy_data, lat = latitude, long = longitude)

# Colour scale spanning the injured counts of all located collisions.
pal <- colorNumeric(
  palette = "viridis",
  domain = data_2018$persons_injured
)

collision_map_data <- data_2018 %>%
  # Keep longitudes in (-75, -70].
  filter(long > -75, long <= -70) %>%
  filter(persons_injured > 2) %>%
  mutate(
    label = str_c(
      "<b>vehicle type: ", vehicle_type, "</b><br>Month: ", month,
      sep = ""
    )
  )

collision_map_data %>%
  sample_n(min(2000, nrow(collision_map_data))) %>%
  leaflet() %>%
  addTiles() %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addLegend(
    "bottomright",
    pal = pal,
    values = ~persons_injured,
    title = "Persons Injured",
    opacity = 1
  ) %>%
  addCircleMarkers(
    lng = ~long,
    lat = ~lat,
    color = ~pal(persons_injured),
    radius = 0.5,
    popup = ~label
  )
# Number of rows per borough (rows without a borough are grouped under NA).
summarise(group_by(data_2018, borough), n())
## # A tibble: 6 x 2
## borough `n()`
## <chr> <int>
## 1 BRONX 22121
## 2 BROOKLYN 46314
## 3 MANHATTAN 29728
## 4 QUEENS 40400
## 5 STATEN ISLAND 5988
## 6 <NA> 71583
# Monthly totals of people injured and killed, in long format
# (columns: month, type, number) for plotting.
#
# NOTE(review): the earlier version added persons_* to the pedestrian, cyclist
# and motorist columns. persons_injured / persons_killed are taken here to be
# the per-collision totals that already include those three groups, so adding
# them together counted everyone twice -- confirm against the data dictionary.
# Columns are also referenced by name instead of by position, and missing
# counts are ignored with na.rm = TRUE.
data_2018_seperate <- tidy_data

data_kill_injured <- data_2018_seperate %>%
  group_by(month) %>%
  summarise(
    sum_injured = sum(persons_injured, na.rm = TRUE),
    sum_killed = sum(persons_killed, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  pivot_longer(
    sum_injured:sum_killed,
    names_to = "type",
    values_to = "number"
  )
# Monthly injured/killed totals as a point-and-line chart, one colour per
# type, with month abbreviations on the x axis; then converted to plotly.
plot_kill_injured <-
  ggplot(data_kill_injured, aes(x = month, y = number, color = type)) +
  geom_point() +
  geom_line() +
  scale_x_continuous(breaks = 1:12, labels = month.abb) +
  scale_y_continuous(breaks = seq(0, 11000, by = 1000)) +
  labs(
    title = "Trend of People being Injured or Killed through the Year"
  ) +
  theme(
    axis.title = element_text(size = 14, face = "bold"),
    plot.title = element_text(hjust = 0.5, color = "Blue")
  )

# Replace the static plot with its interactive version and display it.
plot_kill_injured <- ggplotly(plot_kill_injured)
plot_kill_injured
# Daily totals of people injured and killed, in long format
# (columns: date_complete, type, number, day, month).
#
# NOTE(review): the earlier version added persons_* to the pedestrian, cyclist
# and motorist columns. persons_injured / persons_killed are taken here to be
# the per-collision totals that already include those three groups, so adding
# them together counted everyone twice -- confirm against the data dictionary.
#
# `day` (day of the year) and `month` are derived from the date itself rather
# than hard-coded with rep(): the hard-coded vectors only fit when exactly 365
# distinct dates are present and sorted chronologically, and mutate() fails
# otherwise. date_complete is a month/day/year string, as shown by how the
# same column is split when tidy_data is built.
data_kill_injured_day <- data_2018_seperate %>%
  group_by(date_complete) %>%
  summarise(
    sum_injured = sum(persons_injured, na.rm = TRUE),
    sum_killed = sum(persons_killed, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  pivot_longer(
    sum_injured:sum_killed,
    names_to = "type",
    values_to = "number"
  ) %>%
  mutate(
    day = as.integer(
      format(as.Date(date_complete, format = "%m/%d/%Y"), "%j")
    ),
    month = as.integer(
      format(as.Date(date_complete, format = "%m/%d/%Y"), "%m")
    )
  )
# Daily injured/killed totals as one line per type. The x axis tick labels are
# hidden because there is one tick per date.
plot_kill_injured_day <-
  ggplot(
    data_kill_injured_day,
    aes(x = date_complete, y = number, group = type, color = type)
  ) +
  geom_line() +
  labs(
    title = "Trend of People being Injured or Killed through the Day",
    x = "Day of the Year"
  ) +
  theme(
    axis.text.x = element_blank(),
    axis.title = element_text(size = 14, face = "bold"),
    plot.title = element_text(hjust = 0.5, color = "Blue")
  )

# Show the static version first ...
plot_kill_injured_day

# ... then replace it with the interactive version and show that as well.
plot_kill_injured_day <- ggplotly(plot_kill_injured_day)
plot_kill_injured_day
```